geneExp_T=read.csv(file = "../TCGA_BRCA_Tumor_RSEM_GeneExp.tsv", header = T, sep = "\t", as.is = T, row.names = NULL)
geneExp_T[1:5,1:5]
## row.names TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## 1 TSPAN6 4.4098524 11.52828776
## 2 TNMD 0.1194379 0.04438564
## 3 DPM1 29.6726970 28.00414215
## 4 SCYL3 6.5723377 6.70692039
## 5 C1orf112 2.8762074 4.81435840
## TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## 1 6.27288098 17.8241862
## 2 0.04878317 0.6663316
## 3 57.36674446 56.1109712
## 4 3.08093040 7.1605879
## 5 4.84118390 2.9625904
geneExp_T_m=as.matrix(geneExp_T[,2:ncol(geneExp_T)])
rownames(geneExp_T_m)=geneExp_T$row.names
#Now it is a matrix format
geneExp_T_m[1:5,1:5]
## TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6 4.4098524 11.52828776
## TNMD 0.1194379 0.04438564
## DPM1 29.6726970 28.00414215
## SCYL3 6.5723377 6.70692039
## C1orf112 2.8762074 4.81435840
## TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6 6.27288098 17.8241862
## TNMD 0.04878317 0.6663316
## DPM1 57.36674446 56.1109712
## SCYL3 3.08093040 7.1605879
## C1orf112 4.84118390 2.9625904
## TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6 23.008325
## TNMD 0.528638
## DPM1 24.222269
## SCYL3 6.241249
## C1orf112 2.429812
#Size of matrix
dim(geneExp_T_m)
## [1] 56499 1102
#Filter out Protein-coding genes
ENSEMBL_annotations=read.csv("Homo_sapiens.GRCh37.87_Annotations.txt",
header=FALSE,sep=" ",stringsAsFactors = F)
ENSEMBL_annotations_prot=unique(ENSEMBL_annotations[ENSEMBL_annotations$V3=="protein_coding",])
geneExp_T_m_prot=subset(geneExp_T_m,rownames(geneExp_T_m) %in% ENSEMBL_annotations_prot$V2)
#Now we have gene matrix (rows) consisting of protein-coding ONLY
dim(geneExp_T_m_prot)
## [1] 18390 1102
head(geneExp_T_m_prot)[,1:5]
## TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6 4.4098524 11.52828776
## TNMD 0.1194379 0.04438564
## DPM1 29.6726970 28.00414215
## SCYL3 6.5723377 6.70692039
## C1orf112 2.8762074 4.81435840
## FGR 1.7643648 1.36997556
## TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6 6.27288098 17.8241862
## TNMD 0.04878317 0.6663316
## DPM1 57.36674446 56.1109712
## SCYL3 3.08093040 7.1605879
## C1orf112 4.84118390 2.9625904
## FGR 5.60683479 6.1761304
## TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6 23.008325
## TNMD 0.528638
## DPM1 24.222269
## SCYL3 6.241249
## C1orf112 2.429812
## FGR 1.727848
#Remove <2 TPM genes; Only retain rows (genes) that show atleast 20% of columns (patients) have TPM values > 2.
TCGA_only_f1=geneExp_T_m_prot
TCGA_only_f1=geneExp_T_m_prot[(rowSums(geneExp_T_m_prot>2)>=as.integer(0.2*ncol(geneExp_T_m_prot))),]
dim(TCGA_only_f1)
## [1] 11116 1102
TCGA_only_f1[1:5,1:5]
## TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6 4.409852 11.528288
## DPM1 29.672697 28.004142
## SCYL3 6.572338 6.706920
## C1orf112 2.876207 4.814358
## FGR 1.764365 1.369976
## TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6 6.272881 17.824186
## DPM1 57.366744 56.110971
## SCYL3 3.080930 7.160588
## C1orf112 4.841184 2.962590
## FGR 5.606835 6.176130
## TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6 23.008325
## DPM1 24.222269
## SCYL3 6.241249
## C1orf112 2.429812
## FGR 1.727848
#Quantile Normalize data
TCGA_only_f1_prot_QN=normalize.quantiles(TCGA_only_f1)
row.names(TCGA_only_f1_prot_QN)=row.names(TCGA_only_f1)
colnames(TCGA_only_f1_prot_QN)=colnames(TCGA_only_f1)
TCGA_only_f1_prot_QN[1:5,1:5]
## TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6 3.728963 10.173951
## DPM1 28.178291 27.724906
## SCYL3 5.591500 5.609570
## C1orf112 2.555565 4.040250
## FGR 1.683587 1.153433
## TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6 6.499250 16.581872
## DPM1 48.766935 55.987137
## SCYL3 3.536875 5.977064
## C1orf112 5.203306 2.197796
## FGR 5.906148 5.035973
## TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6 24.185118
## DPM1 25.407640
## SCYL3 6.108447
## C1orf112 2.168783
## FGR 1.408451
#Log2 transform data
TCGA_only_f1_prot_QN_l2=log2(TCGA_only_f1_prot_QN+1)
TCGA_only_f1_prot_QN_l2[1:5,1:5]
## TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6 2.241524 3.482067
## DPM1 4.866823 4.844230
## SCYL3 2.720607 2.724556
## C1orf112 1.830079 2.333495
## FGR 1.424163 1.106638
## TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6 2.906746 4.136017
## DPM1 5.637116 5.832564
## SCYL3 2.181699 2.802620
## C1orf112 2.633037 1.677078
## FGR 2.787881 2.593586
## TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6 4.654500
## DPM1 4.722883
## SCYL3 2.829534
## C1orf112 1.663929
## FGR 1.268105
#Mean-normalize matrix
TCGA_only_f1_prot_QN_l2_m=TCGA_only_f1_prot_QN_l2-apply(TCGA_only_f1_prot_QN_l2,1,mean)
TCGA_only_f1_prot_QN_l2_m[1:5,1:5]
## TCGA.BH.A0W3.01A.11R.A109.07 TCGA.A8.A07F.01A.11R.A00Z.07
## TSPAN6 -1.1274625 0.1130812
## DPM1 -0.1703116 -0.1929048
## SCYL3 0.2234674 0.2274169
## C1orf112 0.1588020 0.6622186
## FGR -0.3458319 -0.6633564
## TCGA.GM.A3XL.01A.11R.A22U.07 TCGA.LL.A441.01A.11R.A24H.07
## TSPAN6 -0.4622400 0.767030468
## DPM1 0.5999805 0.795429314
## SCYL3 -0.3154405 0.305480522
## C1orf112 0.9617604 0.005801225
## FGR 1.0178868 0.823591895
## TCGA.BH.A0HP.01A.12R.A084.07
## TSPAN6 1.285513333
## DPM1 -0.314251637
## SCYL3 0.332394828
## C1orf112 -0.007348134
## FGR -0.501889099
#Get top5K variable genes
var_TCGA_only=apply(TCGA_only_f1_prot_QN_l2, 1, var)
TCGA_only_f1_prot_QN_l2_5K=TCGA_only_f1_prot_QN_l2[order(var_TCGA_only, decreasing=TRUE)[1:5000],]
TCGA_only_f1_prot_QN_l2_m_5K=TCGA_only_f1_prot_QN_l2_m[order(var_TCGA_only, decreasing=TRUE)[1:5000],]
#Most variable genes (top 25)
sort(var_TCGA_only, decreasing=TRUE)[1:25]
## SCGB2A2 SCGB1D2 TFF1 PIP CPB1 MUCL1 CLEC3A
## 15.771835 13.297163 12.522298 11.892873 10.766111 10.256921 9.396407
## LTF CALML5 S100A7 TFF3 AGR3 AGR2 FDCSP
## 8.803288 8.641087 8.517375 8.410280 7.886337 7.539050 7.339040
## CEACAM6 S100A9 CYP4Z1 NPY1R BMPR1B KRT14 KRT5
## 6.995001 6.978111 6.907518 6.894458 6.651942 6.390517 5.884929
## GABRP KRT17 GFRA1 KRT81
## 5.883652 5.836392 5.752835 5.712550
Supervised k-means clustering is used here. Again the goal is NOT to determine the number of row or column clusters here.
#Use the top5K features (genes) which normalized by its mean value for each row and visualize as a Heatmap
#Kmeans here
Heatmap(TCGA_only_f1_prot_QN_l2_m_5K,
col = colorRamp2(c(-2, 0, 2),c("purple", "#EEEEEE", "yellow")),
show_column_names = F, show_row_names = F,
cluster_columns = T,cluster_rows = T,
show_column_dend = T, show_row_dend = T,
clustering_distance_rows = "pearson", clustering_distance_columns = "pearson",
show_heatmap_legend =T,
row_km = 8, column_km = 4
)
Unsupervised Hierarchical clustering is used here
#Hierarchical and unsupervised clustering
Heatmap(TCGA_only_f1_prot_QN_l2_m_5K,
col = colorRamp2(c(-2, 0, 2),c("blue", "#EEEEEE", "orange")),
show_column_names = F, show_row_names = F,
cluster_columns = T,cluster_rows = T,
show_column_dend = T, show_row_dend = T,
clustering_distance_rows = "pearson", clustering_distance_columns = "pearson",
show_heatmap_legend =T
)